Using R for Data Wrangling, Data Visualization, and Data Analysis

I created this R Markdown document to demonstrate examples of R code I’ve used for work projects and personal data exploration.

Using R to create animated plots

I wrote the following code to demonstrate differences in COVID-19 hospitalizations among fully vaccinated and unvaccinated adults over time, using data provided by the New York State Department of Health (https://coronavirus.health.ny.gov/covid-19-breakthrough-data).

Importing Data

# Read the wide-format source table: one row per reporting date, one column
# per vaccination-status series.
USACOVID <- read.csv(file = "COVID_Data_R2.csv")

Loading Libraries

library(tidyverse)
library(ggplot2)
library(ggthemes)
library(gganimate)

Demonstrating Untidy Data

USACOVID
##         Month Vaccinated.with.updated.booster
## 1    8/1/2022                              NA
## 2   8/14/2022                              NA
## 3   8/21/2022                              NA
## 4   8/28/2022                              NA
## 5    9/4/2022                              NA
## 6   9/11/2022                              NA
## 7   9/18/2022                            0.00
## 8   9/25/2022                            0.21
## 9   10/2/2022                            0.12
## 10  10/9/2022                            0.11
## 11 10/16/2022                            0.22
## 12 10/23/2022                            0.18
## 13 10/30/2022                            0.23
## 14  11/6/2022                            0.20
## 15 11/13/2022                            0.29
## 16 11/20/2022                            0.31
## 17 11/27/2022                            0.37
## 18  12/4/2022                            0.43
## 19 12/11/2022                            0.40
## 20 12/18/2022                            0.50
## 21 12/25/2022                            0.53
## 22   1/1/2023                            0.44
## 23   1/8/2023                            0.39
## 24  1/15/2023                            0.33
## 25  1/22/2023                            0.30
## 26  1/29/2023                            0.31
## 27   2/5/2023                            0.26
## 28  2/12/2023                            0.28
## 29  2/19/2023                            0.31
## 30  2/26/2023                            0.23
## 31   3/5/2023                            0.21
## 32  3/12/2023                            0.16
## 33  3/19/2023                            0.17
## 34  3/26/2023                            0.17
## 35                                         NA
## 36                                         NA
## 37                                         NA
## 38                                         NA
## 39                                         NA
## 40                                         NA
## 41                                         NA
## 42                                         NA
##    Vaccinated.without.updated.booster Unvaccinated
## 1                                0.67         3.69
## 2                                0.60         3.35
## 3                                0.59         3.06
## 4                                0.61         3.03
## 5                                0.55         2.79
## 6                                0.51         2.47
## 7                                0.46         2.40
## 8                                0.44         2.51
## 9                                0.48         2.41
## 10                               0.49         2.48
## 11                               0.56         2.46
## 12                               0.59         2.85
## 13                               0.60         2.85
## 14                               0.62         2.71
## 15                               0.63         2.86
## 16                               0.73         3.24
## 17                               0.88         4.08
## 18                               0.90         4.01
## 19                               0.86         3.83
## 20                               0.93         4.03
## 21                               1.01         4.38
## 22                               0.87         3.84
## 23                               0.67         3.29
## 24                               0.55         2.32
## 25                               0.50         2.02
## 26                               0.41         2.10
## 27                               0.42         1.67
## 28                               0.39         1.38
## 29                               0.32         1.60
## 30                               0.30         1.26
## 31                               0.23         1.33
## 32                               0.21         0.95
## 33                               0.16         0.84
## 34                               0.14         0.61
## 35                                 NA           NA
## 36                                 NA           NA
## 37                                 NA           NA
## 38                                 NA           NA
## 39                                 NA           NA
## 40                                 NA           NA
## 41                                 NA           NA
## 42                                 NA           NA

Creating a new tidy data set

# Reshape wide -> long: every column except `Month` is stacked into a
# (`name`, `value`) pair, giving one row per date and vaccination status.
tidyUSACOVID <- pivot_longer(
  USACOVID,
  cols = -Month,
  names_to = "name",
  values_to = "value"
)

Demonstrating new tidy data set

tidyUSACOVID
## # A tibble: 126 x 3
##    Month     name                               value
##    <chr>     <chr>                              <dbl>
##  1 8/1/2022  Vaccinated.with.updated.booster    NA   
##  2 8/1/2022  Vaccinated.without.updated.booster  0.67
##  3 8/1/2022  Unvaccinated                        3.69
##  4 8/14/2022 Vaccinated.with.updated.booster    NA   
##  5 8/14/2022 Vaccinated.without.updated.booster  0.6 
##  6 8/14/2022 Unvaccinated                        3.35
##  7 8/21/2022 Vaccinated.with.updated.booster    NA   
##  8 8/21/2022 Vaccinated.without.updated.booster  0.59
##  9 8/21/2022 Unvaccinated                        3.06
## 10 8/28/2022 Vaccinated.with.updated.booster    NA   
## # ... with 116 more rows

Loading lubridate package to change the “Month” column from character type variable to a date type variable

library(lubridate)

Changing variable type for “Month” variable

# Parse the month/day/year strings into Date values so the x-axis can use a
# date scale. Blank `Month` strings (the trailing empty CSV rows) become NA.
tidyUSACOVID[["Month"]] <- mdy(tidyUSACOVID[["Month"]])

Demonstrating change in variable type

tidyUSACOVID
## # A tibble: 126 x 3
##    Month      name                               value
##    <date>     <chr>                              <dbl>
##  1 2022-08-01 Vaccinated.with.updated.booster    NA   
##  2 2022-08-01 Vaccinated.without.updated.booster  0.67
##  3 2022-08-01 Unvaccinated                        3.69
##  4 2022-08-14 Vaccinated.with.updated.booster    NA   
##  5 2022-08-14 Vaccinated.without.updated.booster  0.6 
##  6 2022-08-14 Unvaccinated                        3.35
##  7 2022-08-21 Vaccinated.with.updated.booster    NA   
##  8 2022-08-21 Vaccinated.without.updated.booster  0.59
##  9 2022-08-21 Unvaccinated                        3.06
## 10 2022-08-28 Vaccinated.with.updated.booster    NA   
## # ... with 116 more rows

Creating a line graph to demonstrate differences in COVID-19 death rates among fully vaccinated and unvaccinated individuals over time.

# Static line chart of the death-rate series: one line per vaccination status
# (`name`), with the rate (`value`) plotted against the parsed date (`Month`).
ggplot(
  data = tidyUSACOVID,
  mapping = aes(x = Month, y = value, group = name, color = name)
) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
  # `size` is kept so older installs still draw the same width.
  geom_line(size = 1.25) +
  labs(
    title = "Rates of COVID-19 Deaths by Vaccination Status in Ages 18 and Older",
    subtitle = "August 1, 2022 - March 26, 2023 (23 U.S. Jurisdictions)",
    y = "Deaths per 100,000 population"
  ) +
  theme_clean() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, face = "bold"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15, face = "bold"),
    plot.title = element_text(
      hjust = 0.5, lineheight = 0.9, face = "bold", size = 15
    ),
    plot.subtitle = element_text(hjust = 0.5),
    legend.title = element_blank(),
    legend.text = element_text(face = "bold", size = 10),
    legend.position = "bottom"
  ) +
  scale_y_continuous(limits = c(0, 5)) +
  # Pin each legend label to its series explicitly. Positional labels alone
  # follow the locale-dependent sort order of `name`, which can swap the two
  # "Vaccinated..." entries and mislabel the lines.
  scale_color_hue(
    breaks = c(
      "Unvaccinated",
      "Vaccinated.with.updated.booster",
      "Vaccinated.without.updated.booster"
    ),
    labels = c(
      "Unvaccinated",
      "Vaccinated with updated booster",
      "Vaccinated without updated booster"
    )
  ) +
  scale_x_date(
    date_breaks = "1 month",
    date_labels = "%b %Y",
    limits = as.Date(c("2022-08-01", "2023-03-26")),
    expand = c(0, 0)
  )

Creating an animated plot

# Drop every row containing an NA (unparsed dates and missing rates) before
# animating, so the revealed lines are built only from complete observations.
tidyUSACOVID <- na.omit(tidyUSACOVID)

# Same chart as the static version, revealed progressively along `Month`.
ggplot(
  data = tidyUSACOVID,
  mapping = aes(x = Month, y = value, group = name, color = name)
) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
  # `size` is kept so older installs still draw the same width.
  geom_line(size = 1.25) +
  labs(
    title = "Rates of COVID-19 Deaths by Vaccination Status in Ages 18 and Older",
    subtitle = "August 1, 2022 - March 26, 2023 (23 U.S. Jurisdictions)",
    y = "Deaths per 100,000 population"
  ) +
  theme_clean() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, face = "bold"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15, face = "bold"),
    plot.title = element_text(
      hjust = 0.5, lineheight = 0.9, face = "bold", size = 15
    ),
    plot.subtitle = element_text(hjust = 0.5),
    legend.title = element_blank(),
    legend.text = element_text(face = "bold", size = 10),
    legend.position = "bottom"
  ) +
  scale_y_continuous(limits = c(0, 5)) +
  # Pin each legend label to its series explicitly. Positional labels alone
  # follow the locale-dependent sort order of `name`, which can swap the two
  # "Vaccinated..." entries and mislabel the lines.
  scale_color_hue(
    breaks = c(
      "Unvaccinated",
      "Vaccinated.with.updated.booster",
      "Vaccinated.without.updated.booster"
    ),
    labels = c(
      "Unvaccinated",
      "Vaccinated with updated booster",
      "Vaccinated without updated booster"
    )
  ) +
  scale_x_date(
    date_breaks = "1 month",
    date_labels = "%b %Y",
    limits = as.Date(c("2022-08-01", "2023-03-26")),
    expand = c(0, 0)
  ) +
  # gganimate: draw each line gradually as `Month` advances.
  transition_reveal(Month)

# Read the wide-format case-rate table, then stack every column except
# `Month` into (`name`, `value`) pairs: one row per date and status.
USACOVIDCASES <- read.csv(file = "COVID_Data_R3.csv")
tidyUSACOVIDCASES <- pivot_longer(
  USACOVIDCASES,
  cols = -Month,
  names_to = "name",
  values_to = "value"
)
tidyUSACOVIDCASES
## # A tibble: 126 x 3
##    Month     name                               value
##    <chr>     <chr>                              <dbl>
##  1 8/1/2022  Vaccinated.with.updated.booster      NA 
##  2 8/1/2022  Vaccinated.without.updated.booster  160.
##  3 8/1/2022  Unvaccinated                        468.
##  4 8/14/2022 Vaccinated.with.updated.booster      NA 
##  5 8/14/2022 Vaccinated.without.updated.booster  144.
##  6 8/14/2022 Unvaccinated                        425.
##  7 8/21/2022 Vaccinated.with.updated.booster      NA 
##  8 8/21/2022 Vaccinated.without.updated.booster  139.
##  9 8/21/2022 Unvaccinated                        401.
## 10 8/28/2022 Vaccinated.with.updated.booster      NA 
## # ... with 116 more rows
# Parse the month/day/year strings into Date values for the date x-axis.
tidyUSACOVIDCASES[["Month"]] <- mdy(tidyUSACOVIDCASES[["Month"]])
tidyUSACOVIDCASES
## # A tibble: 126 x 3
##    Month      name                               value
##    <date>     <chr>                              <dbl>
##  1 2022-08-01 Vaccinated.with.updated.booster      NA 
##  2 2022-08-01 Vaccinated.without.updated.booster  160.
##  3 2022-08-01 Unvaccinated                        468.
##  4 2022-08-14 Vaccinated.with.updated.booster      NA 
##  5 2022-08-14 Vaccinated.without.updated.booster  144.
##  6 2022-08-14 Unvaccinated                        425.
##  7 2022-08-21 Vaccinated.with.updated.booster      NA 
##  8 2022-08-21 Vaccinated.without.updated.booster  139.
##  9 2022-08-21 Unvaccinated                        401.
## 10 2022-08-28 Vaccinated.with.updated.booster      NA 
## # ... with 116 more rows
# Static line chart of the case-rate series: one line per vaccination status
# (`name`), with the rate (`value`) plotted against the parsed date (`Month`).
ggplot(
  data = tidyUSACOVIDCASES,
  mapping = aes(x = Month, y = value, group = name, color = name)
) +
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
  # `size` is kept so older installs still draw the same width.
  geom_line(size = 1.25) +
  labs(
    title = "Rates of COVID-19 Cases by Vaccination Status in Ages 18 and Older",
    subtitle = "August 1, 2022 - April 16, 2023 (24 U.S. Jurisdictions)",
    y = "Cases per 100,000 population"
  ) +
  theme_clean() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, face = "bold"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 15, face = "bold"),
    plot.title = element_text(
      hjust = 0.5, lineheight = 0.9, face = "bold", size = 15
    ),
    plot.subtitle = element_text(hjust = 0.5),
    legend.title = element_blank(),
    legend.text = element_text(face = "bold", size = 10),
    legend.position = "bottom"
  ) +
  scale_y_continuous(limits = c(0, 600)) +
  # Pin each legend label to its series explicitly. Positional labels alone
  # follow the locale-dependent sort order of `name`, which can swap the two
  # "Vaccinated..." entries and mislabel the lines.
  scale_color_hue(
    breaks = c(
      "Unvaccinated",
      "Vaccinated.with.updated.booster",
      "Vaccinated.without.updated.booster"
    ),
    labels = c(
      "Unvaccinated",
      "Vaccinated with updated booster",
      "Vaccinated without updated booster"
    )
  ) +
  scale_x_date(
    date_breaks = "1 month",
    date_labels = "%b %Y",
    limits = as.Date(c("2022-08-01", "2023-04-16")),
    expand = c(0, 0)
  )

```